<?php

namespace StudyBuddy\String;

use \StudyBuddy\Utf8String;

/**
 * Class for parsing html fragment
 * using DOMDocument class of php
 * and multibyte-safe regex functions
 *
 * All methods of this class are utf-8 safe
 *
 */
class HTMLString extends \StudyBuddy\Dom\Document implements \StudyBuddy\Interfaces\StudyBuddyObject, \Serializable {

    /**
     * Tracker flag to indicate that a method of this class
     * or sub-class added a CDATA node to the tree.
     * This signals to importCDATA() that
     * this object should be reloaded.
     * It is the responsibility of the implementing sub-class
     * to set this flag to true whenever a method
     * adds a CDATA node to the tree
     *
     * @var bool
     */
    protected $bCDATA = false;

    /**
     * Factory method
     * Creates an object of this class (or of the calling sub-class)
     * and loads the html string into it. The string is wrapped
     * in a <div> by loadHTMLString()
     *
     * As a side effect the mb_regex encoding is set to UTF-8
     *
     * @param mixed $s string | object of type Utf8String
     * by being an object of type Utf8String it's guaranteed
     * to be in utf-8 charset
     *
     * @return object of this class
     *
     * @throws \StudyBuddy\DevException if $s is of unsupported type
     * or if unable to load the string
     */
    public static function factory($s) {

        if (\is_string($s)) {
            $sHtml = $s;
        } elseif ($s instanceof \StudyBuddy\Utf8String) {
            $sHtml = $s->valueOf();
        } else {
            throw new \StudyBuddy\DevException('Input param $s must be string or instance of Utf8String. was: ' . var_export($s, true));
        }

        $oDom = new static();
        $oDom->preserveWhiteSpace = true;

        /**
         * Silence libxml warnings while parsing, but always
         * restore the previous error reporting level BEFORE
         * deciding to throw, otherwise a failed load would leave
         * error reporting switched off for the rest of the request
         */
        $ER = error_reporting(0);
        $loaded = @$oDom->loadHTMLString($sHtml);
        error_reporting($ER);

        if (false === $loaded) {
            throw new \StudyBuddy\DevException('Error. Unable to load html string: ' . $sHtml);
        }

        \mb_regex_encoding('UTF-8');

        return $oDom;
    }

    /**
     * Same as loadHTMLString() only the input
     * is an object of type Utf8String
     *
     * @param \StudyBuddy\Utf8String $oHtml
     *
     * @return bool result of loadHTMLString()
     */
    public function loadUTF8String(\StudyBuddy\Utf8String $oHtml) {

        return $this->loadHTMLString($oHtml->valueOf());
    }

    /**
     * Load html string into this object
     *
     * @param string $s
     * Must be absolutely sure that this string
     * is in a valid UTF-8 encoding!
     *
     * @return bool true if loadHTML() succeed or false if not
     */
    public function loadHTMLString($s) {
        /**
         * Extremely important to add the
         * <META CONTENT="text/html; charset=utf-8">
         * This is the way to tell the DOM (more specifically
         * the libxml) that input is in utf-8 encoding
         * Without this the DOM will assume that input is in the
         * default ISO-8859-1 format and then
         * will try to recode it to utf8
         * essentially it will do its own conversion to utf8,
         * messing up the string because it's already in utf8 and does not
         * need converting
         *
         * IMPORTANT: we are also wrapping the whole string in <div>
         * so that it will be easy to get back just the contents of
         * the first div (see getHtml() and getXML())
         *
         */
        $s = '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
                      "http://www.w3.org/TR/REC-html40/loose.dtd">
			<head>
  			<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
			</head>
			<body><div>' . $s . '</div></body></html>';

        return $this->loadHTML($s);
    }

    /**
     * Extract the contents of the wrapper <body><div>...</div></body>
     * from a serialized document string
     *
     * @param string $s result of saveHTML() or saveXML()
     *
     * @return string everything between the opening <body><div>
     * and the last </div></body>
     *
     * @throws \StudyBuddy\Exception if the wrapper is not found
     * or if the wrapped fragment is an empty string
     */
    private function extractBodyFragment($s) {
        $matches = array();
        $found = preg_match('/(\<body>\<div>)(.*)(\<\/div>\<\/body>)/sm', $s, $matches);

        /**
         * Must NOT use empty() here: empty('0') is true in php,
         * which would reject a perfectly valid fragment "0".
         * Only a failed match or a really empty fragment is an error
         */
        if (1 !== $found || !isset($matches[2]) || '' === $matches[2]) {
            throw new \StudyBuddy\Exception('unable to extract string from result html: ' . $s);
        }

        return $matches[2];
    }

    /**
     * Get HTML fragment (the contents of the wrapper <div>
     * inside the <body>, without the actual <body> and <div> tags)
     *
     * @return string HTML string as produced by saveHTML()
     *
     * @throws \StudyBuddy\Exception if the fragment cannot be
     * extracted or is empty
     */
    public function getHtml() {

        return $this->extractBodyFragment($this->saveHTML());
    }

    /**
     * Get XML fragment (the contents of the wrapper <div>
     * inside the <body>, without the actual <body> and <div> tags)
     *
     * Note: this method turns off preserveWhiteSpace on this object
     * and calls removeWhitespace() on the document element
     * before serializing
     *
     * @return string XML string
     * It may be different from the input html because
     * <br> will be replaced with <br/> and there may be other
     * changes to make it a valid XML string
     *
     * @throws \StudyBuddy\Exception if the fragment cannot be
     * extracted or is empty
     */
    public function getXML() {
        $this->preserveWhiteSpace = false;
        $this->documentElement->removeWhitespace();

        return $this->extractBodyFragment($this->saveXML());
    }

    /**
     * Get only the text from this document.
     * It will basically strip all the tags and return only
     * text value of tags
     *
     * @return string plaintext, empty string if this
     * document has no <body> element
     */
    public function getText() {
        $oBody = $this->getElementsByTagName('body')->item(0);

        // item(0) is null when there is no <body> (nothing loaded yet)
        if (null === $oBody) {

            return '';
        }

        return $oBody->textContent;
    }

    /**
     * Get the length of all text in this document,
     * not counting any of the html tags
     *
     * @return int length of text content
     */
    public function length() {

        return \mb_strlen($this->getText());
    }

    /**
     * Get all text nodes of this HTML string
     *
     * @return object of type DOMNodeList
     */
    public function getTextNodes() {
        return $this->xpath('//text()');
    }

    /**
     * Get count of words in this html document
     * This is the right way to get word count
     * from HTML doc. The simple way of strip_tags and
     * then explode by spaces will not work if
     * html string is just one long
     * string run together without white spaces
     * and using regex is usually not the best way
     * to deal with html string.
     *
     * Each Text Node element is treated
     * as a separate Utf8String object and the word
     * counting itself is delegated to Utf8String::getWordsCount()
     *
     * @return int count of words in this html string
     */
    public function getWordsCount() {
        $Nodes = $this->getTextNodes();

        /**
         * Check the result itself BEFORE reading ->length,
         * otherwise the guard cannot protect the property access
         */
        if (!$Nodes || 0 === $Nodes->length) {

            return 0;
        }

        $count = 0;
        $len = $Nodes->length;
        for ($i = 0; $i < $len; $i += 1) {
            $UTF8String = Utf8String::factory($Nodes->item($i)->data, 'utf-8', true);
            $count += $UTF8String->getWordsCount();
        }

        return $count;
    }

    /**
     *
     * Reloads the html into this object
     * This is useful to turn all the CDATA
     * sections into the actual DOM tree
     *
     * Parser warnings are deliberately suppressed
     *
     * @return object $this
     */
    public function reload() {
        @$this->loadHTML($this->saveHTML());

        return $this;
    }

    /**
     * Reload document only if CData has been
     * added anywhere in the document
     * This basically imports contents of CDATA section
     * into the DOM Tree so it's not just a string anymore
     * but a part of DOM
     *
     * @return object $this
     */
    public function importCDATA() {
        if ($this->bCDATA) {
            $this->reload();
            $this->bCDATA = false;
        }

        return $this;
    }

    /**
     *
     * @return bool true if this object
     * has CDATA section added by one of the
     * methods, false otherwise
     */
    public function hasCDATA() {

        return $this->bCDATA;
    }

    /**
     * @Important to override the one from parent
     * because parent's class returns the saveXML() version
     * and here we need HTML
     *
     * NOTE: getHtml() may throw; before php 7.4 an exception
     * thrown from __toString() results in a fatal error
     *
     * (non-PHPdoc)
     * @see StudyBuddy\Interfaces.StudyBuddyObject::__toString()
     */
    public function __toString() {
        return $this->getHtml();
    }

    /**
     * Same as __toString(), just for
     * consistency with our String class
     *
     * @return string html contents of this String
     */
    public function valueOf() {
        return $this->getHtml();
    }

    /**
     * Serialized form is the complete html document
     * as returned by saveHTML()
     *
     * (non-PHPdoc)
     * @see Serializable::serialize()
     */
    public function serialize() {
        return $this->saveHTML();
    }

    /**
     * Restores the document from the html produced by serialize()
     * and re-applies the encoding, whitespace and node class settings
     *
     * (non-PHPdoc)
     * @see Serializable::unserialize()
     */
    public function unserialize($serialized) {
        $this->loadHTML($serialized);
        $this->encoding = 'UTF-8';
        $this->preserveWhiteSpace = true;
        $this->registerNodeClass('DOMElement', '\StudyBuddy\Dom\Element');
    }

}
